library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(WGCNA)
Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust


Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'rmarkdown':
  method         from
  print.paged_df     
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor
library(gplots)

Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess
options(stringsAsFactors = FALSE)

load sample info

sample.description <- read.csv("../input/sample.description.csv")

load reads

lcpm <- read.csv("../output/log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
[1] 26704    48

Filter for sporophyte samples:

sample.description <- sample.description %>% filter(str_detect(tissue, "S5|WYS"))
lcpm <- lcpm[,sample.description$sample]

Filter for genes with the highest coefficient of variation

CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))

names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
Ceric.18G076300.v2.1 Ceric.33G031700.v2.1 
          0.08763174           0.18394629 
quantile(CV, 0.23)
       23% 
0.08608846 
lcpm.filter <- lcpm[CV > quantile(CV, 0.23),]
dim(lcpm.filter)
[1] 20562    30

WGCNA wants genes in columns

lcpm.filter.t <- t(lcpm.filter)

Soft thresholding

powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 15000)
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 15000 of 20562
Warning: executing %dopar% sequentially: no parallel backend registered
   ..working on genes 15001 through 20562 of 20562
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

choose 8

softPower <- 8
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
dissTOM <- 1-TOM
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

define modules

# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit <- 2, pamRespectsDendro = FALSE,
                             minClusterSize = minModuleSize);
 ..done.
table(dynamicMods)
dynamicMods
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20 
2597 1248 1139  890  791  778  659  653  508  498  482  459  436  428  421  408  404  380  376  367 
  21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40 
 350  336  314  312  308  306  283  273  251  246  243  236  226  198  198  190  184  174  172  170 
  41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56 
 145  143  142  121  118  113  110  107   98   94   93   87   87   77   74   61 
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
        bisque4           black            blue           brown          brown4            cyan 
            110             659            1248            1139             113             428 
      darkgreen        darkgrey     darkmagenta  darkolivegreen      darkorange     darkorange2 
            336             312             198             226             306             118 
        darkred   darkslateblue   darkturquoise     floralwhite           green     greenyellow 
            350             107             314             121             791             482 
         grey60           ivory       lightcyan      lightcyan1      lightgreen      lightpink4 
            404             142             408             143             380              61 
lightsteelblue1     lightyellow         magenta          maroon   mediumpurple3    midnightblue 
            145             376             508              74             170             421 
   navajowhite2          orange      orangered4   paleturquoise  palevioletred3            pink 
             77             308             172             243              87             653 
          plum1           plum2          purple             red       royalblue     saddlebrown 
            174              98             498             778             367             251 
         salmon         salmon4         sienna3         skyblue        skyblue3       steelblue 
            436              87             198             273             184             246 
            tan        thistle1        thistle2       turquoise          violet           white 
            459              93              94            2597             236             283 
         yellow     yellowgreen 
            890             190 
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")

merge similar modules

# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")

merge with correlation > 0.75

MEDissThres = 0.25
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
abline(h=MEDissThres, col = "red")

# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
 mergeCloseModules: Merging modules whose distance is less than 0.25
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 56 module eigengenes in given set.
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 31 module eigengenes in given set.
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 26 module eigengenes in given set.
   Calculating new MEs...
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 26 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs

compare pre and post merge

sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

#dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs 
table(merge$colors)

          black            blue           brown            cyan       darkgreen        darkgrey 
            659            2521            3736             428             423             775 
    darkorange2         darkred   darkslateblue           green     greenyellow          grey60 
            118             730             107            1250             482             825 
          ivory       lightcyan      lightcyan1 lightsteelblue1         magenta          maroon 
           1032            1439            1381             145             912             261 
  mediumpurple3      orangered4           plum2     saddlebrown         salmon4         skyblue 
            170            2120              98             251              87             273 
      steelblue        thistle1 
            246              93 
length(table(merge$colors))
[1] 26
median(table(merge$colors))
[1] 455

Look at modules

Which module is LFY in?

CrLFY1 <- "Ceric.33G031700.2.v2.1" # this is the primary transcript.  There is another but it is expressed at very low levels.

CrLFY2 <- "Ceric.18G076300"

module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)

module.assignment %>%
  filter(str_detect(geneID, "18G076300|33G031700"))

Interesting: they are in different modules(!). But both are quite large…

module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)

Plot eigengenes

Make sure sample info sheet is in the correct order.

rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
sample.eigen.l <- sample.eigen %>%
  mutate(gt_tissue=str_c(base_gt, "-", tissue)) %>%
  pivot_longer(starts_with("ME"), names_to = "ME")

sample.eigen.means <- sample.eigen.l %>%
  group_by(gt_tissue, ME) %>%
  summarise(value = mean(value))
`summarise()` has grouped output by 'gt_tissue'. You can override using the `.groups` argument.
sample.eigen.l %>%
  ggplot(aes(x=gt_tissue, y = value)) +
    geom_point(aes(color = tissue)) +
    geom_line(data=sample.eigen.means, group = 1, lwd=.3) + 
  facet_wrap(~ME, ncol=5) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) +
  scale_color_brewer(type="qual", palette = "Set3")

A heat map:

MEs.m <- as.matrix(MEs)
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")

save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_sporophyteSamples.Rdata")
---
title: "04_WGCNA"
author: "Julin Maloof"
date: "2025-02-16"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(tidyverse)
library(WGCNA)
library(gplots)
options(stringsAsFactors = FALSE)
```

load sample info
```{r}
sample.description <- read.csv("../input/sample.description.csv")
```


load reads

```{r}
lcpm <- read.csv("../output/log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
```

Filter for sporophyte samples:

```{r}
sample.description <- sample.description %>% filter(str_detect(tissue, "S5|WYS"))
lcpm <- lcpm[,sample.description$sample]
```


Filter for genes with the highest coefficient of variation

```{r}
CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))
```
```{r}
names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
```

```{r}
quantile(CV, 0.23)
```

```{r}
lcpm.filter <- lcpm[CV > quantile(CV, 0.23),]
dim(lcpm.filter)
```

WGCNA wants genes in columns

```{r}
lcpm.filter.t <- t(lcpm.filter)
```


Soft thresholding
```{r}
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 15000)
```

```{r}
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
```
choose 8

```{r}
softPower <- 8
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
dissTOM <- 1-TOM
```

```{r}
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)
```

define modules

```{r}
# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit <- 2, pamRespectsDendro = FALSE,
                             minClusterSize = minModuleSize);
table(dynamicMods)
```

```{r}
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")
```

merge similar modules

```{r}
# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
```

merge with correlation > 0.75
```{r}
MEDissThres = 0.25
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
```

compare pre and post merge
```{r}
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
#dev.off()
```

```{r}
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs 
table(merge$colors)
length(table(merge$colors))
median(table(merge$colors))

```

## Look at modules

Which module is LFY in?

```{r}
CrLFY1 <- "Ceric.33G031700.2.v2.1" # this is the primary transcript.  There is another but it is expressed at very low levels.

CrLFY2 <- "Ceric.18G076300"

module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)

module.assignment %>%
  filter(str_detect(geneID, "18G076300|33G031700"))
```
Interesting: they are in different modules(!).  But both are quite large...

```{r}
module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)
```

Plot eigengenes

Make sure sample info sheet is in the correct order.
```{r}
rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
```

```{r}
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
```

```{r, fig.height=6, fig.width=10}
sample.eigen.l <- sample.eigen %>%
  mutate(gt_tissue=str_c(base_gt, "-", tissue)) %>%
  pivot_longer(starts_with("ME"), names_to = "ME")

sample.eigen.means <- sample.eigen.l %>%
  group_by(gt_tissue, ME) %>%
  summarise(value = mean(value))

sample.eigen.l %>%
  ggplot(aes(x=gt_tissue, y = value)) +
    geom_point(aes(color = tissue)) +
    geom_line(data=sample.eigen.means, group = 1, lwd=.3) + 
  facet_wrap(~ME, ncol=5) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) +
  scale_color_brewer(type="qual", palette = "Set3")
```
A heat map:

```{r, fig.height=7}
MEs.m <- as.matrix(MEs)
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")
```


```{r}
save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_sporophyteSamples.Rdata")
```

